#import libraries
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
import warnings
warnings.filterwarnings('ignore')
import plotly.graph_objects as go
import plotly.express as px
import seaborn as sn
import chart_studio.plotly as py
# Sign in to Chart Studio with an API key taken from the environment; a key
# must never be committed to the source file (any key that was committed
# should be treated as leaked and rotated). Sign-in is skipped when no key is
# configured, so the script still runs for figures rendered with fig.show().
import os

_chart_studio_key = os.environ.get('PLOTLY_API_KEY')
if _chart_studio_key:
    py.sign_in(os.environ.get('PLOTLY_USERNAME', 'szhang61'), _chart_studio_key)
pd.set_option('display.precision', 2)  # show floats with two decimal places
Data source:
The World Happiness scores come from Kaggle: https://www.kaggle.com/mathurinache/world-happiness-report and
originally from the World Happiness Report: https://worldhappiness.report/
The data covers the happiness scores of 153 countries from 2015 to 2020, 11,665 data points in total.
# Load one CSV per report year (2015.csv ... 2020.csv) from the working directory.
y2015, y2016, y2017, y2018, y2019, y2020 = (
    pd.read_csv(f'{year}.csv') for year in range(2015, 2021)
)
# Preview the raw 2015 data.
y2015.head(5)
# The standard-error column is not needed for this analysis.
y2015 = y2015.drop(columns=['Standard Error'])
# Map the 2015 headers onto the short column names used in the rest of the script.
y2015 = y2015.rename(columns={
    'Happiness Rank': 'Hap_R',
    'Happiness Score': 'Hap_S',
    'Economy (GDP per Capita)': 'GDP_per_Cap',
    'Family': 'Social_sup',
    'Health (Life Expectancy)': 'Healthy_Life_Exp',
    'Freedom': 'Freedom_to_make_life_choi',
    'Trust (Government Corruption)': 'Corruption_Percep',
    'Dystopia Residual': 'Dystopia_R',
})
# Tag every row with its report year.
y2015 = y2015.assign(Year=2015)
y2015.head()
# Preview the raw 2016 data.
y2016.head(5)
# The confidence-interval columns are not needed for this analysis.
y2016 = y2016.drop(columns=['Lower Confidence Interval', 'Upper Confidence Interval'])
# Map the 2016 headers onto the short column names used in the rest of the script.
y2016 = y2016.rename(columns={
    'Happiness Rank': 'Hap_R',
    'Happiness Score': 'Hap_S',
    'Economy (GDP per Capita)': 'GDP_per_Cap',
    'Family': 'Social_sup',
    'Health (Life Expectancy)': 'Healthy_Life_Exp',
    'Freedom': 'Freedom_to_make_life_choi',
    'Trust (Government Corruption)': 'Corruption_Percep',
    'Dystopia Residual': 'Dystopia_R',
})
# Tag every row with its report year.
y2016 = y2016.assign(Year=2016)
y2016.head()
# Preview the raw 2017 data.
y2017.head()
# The whisker columns are not needed for this analysis.
y2017 = y2017.drop(columns=['Whisker.high', 'Whisker.low'])
# Map the dot-separated 2017 headers onto the short column names used in the
# rest of the script.
y2017 = y2017.rename(columns={
    'Happiness.Rank': 'Hap_R',
    'Happiness.Score': 'Hap_S',
    'Economy..GDP.per.Capita.': 'GDP_per_Cap',
    'Family': 'Social_sup',
    'Health..Life.Expectancy.': 'Healthy_Life_Exp',
    'Freedom': 'Freedom_to_make_life_choi',
    'Trust..Government.Corruption.': 'Corruption_Percep',
    'Dystopia.Residual': 'Dystopia_R',
})
# Tag every row with its report year.
y2017 = y2017.assign(Year=2017)
# Country -> Region lookup table built from the 2015 data.
region = y2015[['Country', 'Region']].set_index('Country')
# Attach Region by country name (left join: a country with no match in the
# 2015 lookup gets a missing Region), then turn Country back into a column.
indexed_2017 = y2017.set_index('Country')
y2017 = indexed_2017.join(region).reset_index()
y2017.head()
# Preview the raw 2018 data.
y2018.head()
# Map the 2018 headers onto the short column names used in the rest of the script.
y2018 = y2018.rename(columns={
    'Overall rank': 'Hap_R',
    'Country or region': 'Country',
    'Score': 'Hap_S',
    'GDP per capita': 'GDP_per_Cap',
    'Social support': 'Social_sup',
    'Healthy life expectancy': 'Healthy_Life_Exp',
    'Freedom to make life choices': 'Freedom_to_make_life_choi',
    'Perceptions of corruption': 'Corruption_Percep',
})
# Derive the Dystopia residual as the score minus the six factor columns,
# subtracted one at a time in the listed order.
dystopia_2018 = y2018['Hap_S']
for factor in ('GDP_per_Cap', 'Social_sup', 'Healthy_Life_Exp',
               'Freedom_to_make_life_choi', 'Generosity', 'Corruption_Percep'):
    dystopia_2018 = dystopia_2018 - y2018[factor]
y2018['Dystopia_R'] = dystopia_2018
# Tag every row with its report year.
y2018 = y2018.assign(Year=2018)
# Attach Region by country name from the `region` lookup table, then turn
# Country back into a column.
indexed_2018 = y2018.set_index('Country')
y2018 = indexed_2018.join(region).reset_index()
y2018.head()
# Preview the raw 2019 data.
y2019.head()
# Map the 2019 headers onto the short column names used in the rest of the script.
y2019 = y2019.rename(columns={
    'Overall rank': 'Hap_R',
    'Country or region': 'Country',
    'Score': 'Hap_S',
    'GDP per capita': 'GDP_per_Cap',
    'Social support': 'Social_sup',
    'Healthy life expectancy': 'Healthy_Life_Exp',
    'Freedom to make life choices': 'Freedom_to_make_life_choi',
    'Perceptions of corruption': 'Corruption_Percep',
})
# Derive the Dystopia residual as the score minus the six factor columns,
# subtracted one at a time in the listed order.
dystopia_2019 = y2019['Hap_S']
for factor in ('GDP_per_Cap', 'Social_sup', 'Healthy_Life_Exp',
               'Freedom_to_make_life_choi', 'Generosity', 'Corruption_Percep'):
    dystopia_2019 = dystopia_2019 - y2019[factor]
y2019['Dystopia_R'] = dystopia_2019
# Tag every row with its report year.
y2019 = y2019.assign(Year=2019)
# Attach Region by country name from the `region` lookup table, then turn
# Country back into a column.
indexed_2019 = y2019.set_index('Country')
y2019 = indexed_2019.join(region).reset_index()
y2019.head()
# Preview the raw 2020 data.
y2020.head()
# Drop the columns that are not carried into the combined data set. Only the
# 'Explained by: ...' factor columns are kept (renamed below).
unused_2020_columns = [
    'Regional indicator',
    'Standard error of ladder score',
    'upperwhisker',
    'lowerwhisker',
    'Logged GDP per capita',
    'Social support',
    'Healthy life expectancy',
    'Freedom to make life choices',
    'Generosity',
    'Perceptions of corruption',
    'Ladder score in Dystopia',
]
y2020 = y2020.drop(columns=unused_2020_columns)
# Map the 2020 headers onto the short column names used in the rest of the script.
y2020 = y2020.rename(columns={
    'Country name': 'Country',
    'Ladder score': 'Hap_S',
    'Explained by: Log GDP per capita': 'GDP_per_Cap',
    'Explained by: Social support': 'Social_sup',
    'Explained by: Healthy life expectancy': 'Healthy_Life_Exp',
    'Explained by: Freedom to make life choices': 'Freedom_to_make_life_choi',
    'Explained by: Perceptions of corruption': 'Corruption_Percep',
    'Dystopia + residual': 'Dystopia_R',
    'Explained by: Generosity': 'Generosity',
})
# Tag every row with its report year.
y2020 = y2020.assign(Year=2020)
# Attach Region by country name from the `region` lookup table, then turn
# Country back into a column.
indexed_2020 = y2020.set_index('Country')
y2020 = indexed_2020.join(region).reset_index()
# The rank is simply the 1-based row position.
# NOTE(review): this assumes the CSV rows are already ordered from highest to
# lowest ladder score - confirm against the file.
y2020['Hap_R'] = y2020.index + 1
y2020.head()
# Read the World Bank GDP workbook (https://data.worldbank.org/indicator/NY.GDP.MKTP.CD),
# skipping the first three rows of the sheet.
GDP = pd.read_excel("GDP.xls", skiprows=3)
# Country -> country-code lookup table, indexed by country name.
code = (
    GDP[['Country Name', 'Country Code']]
    .rename(columns={'Country Name': 'Country', 'Country Code': 'Code'})
    .set_index('Country')
)
# 2020 happiness score per country with the country code attached by name;
# Country is turned back into a column afterwards.
scores_2020 = y2020[['Country', 'Hap_S']].set_index('Country')
happy20 = scores_2020.join(code).reset_index()
happy20.head()
# World map of the 2020 happiness score, one country per code in happy20.
fig = go.Figure(
    data=go.Choropleth(
        locations=happy20['Code'],
        z=happy20['Hap_S'],
        text=happy20['Country'],
        autocolorscale=True,
        reversescale=True,
        marker_line_color='darkgray',
        marker_line_width=0.5,
        colorbar_title='Happiness Score',
    )
)
# Source credit placed on the figure in paper coordinates. The text is built
# with adjacent string literals so that no source indentation can leak into
# the displayed link text.
source_note = dict(
    x=0.55,
    y=0.1,
    xref='paper',
    yref='paper',
    text=('Source: <a href="https://worldhappiness.report/ed/2020/">'
          'World Happiness Report 2020</a>'),
    showarrow=False,
)
fig.update_layout(
    title_text='2020 Global Happiness Score',  # spelling fixed (was 'Happyness')
    geo=dict(showframe=False, showcoastlines=False,
             projection_type='equirectangular'),
    annotations=[source_note],
)
fig.show()
# Stack the six yearly frames into a single long data set.
yearly_frames = [y2015, y2016, y2017, y2018, y2019, y2020]
all_years = pd.concat(yearly_frames)
all_years.head()
# Happiness score per country and year, coloured by region. Rows with any
# missing value are left out of the plot.
fig = px.scatter(
    all_years.dropna(),
    x='Year',
    y='Hap_S',
    color='Region',
    size='Hap_S',
    hover_name='Country',
    hover_data=['Hap_S'],
    title='Happiness Score VS Years 2015-2020',
)
fig.show()
# Correlation matrix of the 2020 data. Only the numeric columns are used:
# y2020 also carries the text columns Country and Region, and DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0. select_dtypes is used
# instead of corr(numeric_only=True) so that older pandas versions keep working.
corrMatrix = y2020.select_dtypes(include='number').corr()
# Plot the correlation matrix as an annotated heatmap.
sn.heatmap(corrMatrix, annot=True)
plt.show()
# Animated scatter (one frame per year): happiness score against GDP per
# capita, coloured by region. Rows with any missing value are left out.
gdp_complete_rows = all_years.dropna()
fig = px.scatter(
    gdp_complete_rows,
    x='GDP_per_Cap',
    y='Hap_S',
    color='Region',
    hover_name='Country',
    animation_frame='Year',
    animation_group='Country',
    title='Happiness Score VS GDP per Capita',
    log_x=True,
    range_x=[0.1, 2],
    range_y=[1, 10],
    size_max=55,
)
fig.show()
# Animated scatter (one frame per year): happiness score against social
# support, coloured by region. Rows with any missing value are left out.
social_complete_rows = all_years.dropna()
fig = px.scatter(
    social_complete_rows,
    x='Social_sup',
    y='Hap_S',
    color='Region',
    hover_name='Country',
    animation_frame='Year',
    animation_group='Country',
    title='Happiness Score VS Social Support',
    log_x=True,
    range_x=[0.1, 2],
    range_y=[1, 10],
    size_max=55,
)
fig.show()
# Animated scatter (one frame per year): happiness score against healthy life
# expectancy, coloured by region. Rows with any missing value are left out.
fig = px.scatter(
    all_years.dropna(),
    x='Healthy_Life_Exp',
    y='Hap_S',
    color='Region',
    hover_name='Country',
    animation_frame='Year',
    animation_group='Country',
    title='Happiness Score VS Healthy life expectancy',  # spelling fixed (was 'expctancy')
    log_x=True,
    range_x=[0.1, 2],
    range_y=[1, 10],
    size_max=55,
)
fig.show()
# Re-index the combined data by (Year, Region, Country) and sort the index so
# that slices by year and region can be taken with .loc.
hierarchical = all_years.set_index(['Year', 'Region', 'Country'])
all_years_indexed = hierarchical.sort_index()
all_years_indexed.head()
# 2020 rows of the Sub-Saharan Africa region. The row key is passed as an
# explicit tuple so it cannot be mistaken for a (row, column) pair.
Sub_Saharan_Africa = all_years_indexed.loc[(2020, 'Sub-Saharan Africa'), :]
# Correlation matrix for that region, shown as an annotated heatmap.
corrMatrix_SSA = Sub_Saharan_Africa.corr()
sn.heatmap(corrMatrix_SSA, annot=True)
plt.show()
# The 30 highest-scoring countries of 2020, with the index levels turned back
# into ordinary columns.
ranked_2020 = all_years_indexed.loc[2020,:,:].sort_values(by='Hap_S', ascending=False)
top30 = ranked_2020.head(30).reset_index()
# Stacked bar chart: one bar per country, one segment per happiness factor.
top30_factor_columns = [
    'GDP_per_Cap',
    'Social_sup',
    'Healthy_Life_Exp',
    'Freedom_to_make_life_choi',
    'Corruption_Percep',
    'Generosity',
    'Dystopia_R',
]
fig = px.bar(top30, x='Country', y=top30_factor_columns, title="Top30 Happy Countries")
fig.show()
# Same 30 countries, reordered from highest to lowest GDP per capita.
top30_gdp = top30.sort_values(by='GDP_per_Cap', ascending=False)
# Stacked bar chart of the happiness factors in that GDP order.
gdp_order_factor_columns = [
    'GDP_per_Cap',
    'Social_sup',
    'Healthy_Life_Exp',
    'Freedom_to_make_life_choi',
    'Corruption_Percep',
    'Generosity',
    'Dystopia_R',
]
fig = px.bar(
    top30_gdp,
    x='Country',
    y=gdp_order_factor_columns,
    title='Top30 Happy Countries sort by GDP per Capita',
)
fig.show()
# Mean value of each happiness factor across the top 30 countries.
top_30 = top30.describe()
# Pick the 'mean' row and the factor columns by label rather than by position:
# a positional slice such as iloc[1:2, 3:] silently selects the wrong columns
# whenever the column layout of top30 shifts (for example when an index level
# such as Year is or is not present as a column).
happiness_factors = [
    'GDP_per_Cap',
    'Social_sup',
    'Healthy_Life_Exp',
    'Freedom_to_make_life_choi',
    'Corruption_Percep',
    'Generosity',
    'Dystopia_R',
]
top30_mean = top_30.loc[['mean'], happiness_factors]
# One row per factor, with columns 'index' (factor name) and 'mean'.
top30_mean = top30_mean.transpose().reset_index()
top30_mean
# Pie chart of each factor's share of the summed factor means.
fig = px.pie(top30_mean, values='mean', names='index', title='Top30 Happy Country Happiness Index % ')
fig.show()
# Happiness score per country for 2015 and for 2020, each indexed by country
# and with the score column named after its year.
data15 = y2015[['Country', 'Hap_S']].set_index('Country').rename(columns={'Hap_S': 2015})
data20 = y2020[['Country', 'Hap_S']].set_index('Country').rename(columns={'Hap_S': 2020})
# Line the two years up by country, keeping the countries present in 2015.
data = data15.join(data20)
# Absolute change and change relative to the 2015 score (in percent).
score_change = data[2020] - data[2015]
data['Change'] = score_change
data['Change_in_percent'] = 100 * score_change / data[2015]
# Country back to an ordinary column; rows with any missing value (such as a
# country with no 2020 score) are dropped.
data = data.reset_index().dropna()
data.head()
# The ten countries with the largest absolute score increase, 2015 to 2020.
by_absolute_change = data.sort_values(by='Change', ascending=False)
top_change = by_absolute_change.head(10)
# Bar chart of those ten countries.
fig = px.bar(
    top_change,
    x='Country',
    y='Change',
    title='Top 10 countries 2015-2020 Happiness score change in fixed value',
)
fig.show()
# The ten countries with the largest percentage score increase, 2015 to 2020.
by_percent_change = data.sort_values(by='Change_in_percent', ascending=False)
top_change_p = by_percent_change.head(10)
# Bar chart of those ten countries.
fig = px.bar(
    top_change_p,
    x='Country',
    y='Change_in_percent',
    title='Top 10 countries 2015-2020 Happiness score change in percentage %',
)
fig.show()
My learning process is simple and follows a sequence of steps. First, I select a topic and do some research to find the datasets I need. Second, based on the topic, I come up with several related questions. Third, I adjust those questions after examining the datasets. Then I clean and restructure the datasets. Finally, I use data analysis and visualization to find answers to my questions.
This project asks: how can we be happier? It is based on the analysis and visualization of world happiness data from 2015 to 2020. The data shows that the countries with the highest happiness scores include Finland, Iceland, and Norway. The regions with high happiness scores are Western Europe, Australia and New Zealand, and North America. The three happiness factors most highly correlated with the happiness score are GDP per capita, social support, and healthy life expectancy. The study of the top 30 countries shows that happy countries have a higher social support index than GDP per capita index, which suggests that the economy is not the most important factor in being happy. People should build a good social support environment to increase their happiness level, for example by balancing work and life and spending more time with friends and family. Having a healthy lifestyle and maintaining a credible social system will also help improve happiness.
Zenodo: https://doi.org/10.5281/zenodo.4266415
Github.io: https://sz389.github.io/data_mining/